Part 2-1-A

# dplyr supplies the pipe and the filter()/row_number() verbs used below.
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Fix the RNG state so the sampling steps later in the script reproduce.
set.seed(1122)

# Read the Adult census training and test splits (both files carry a header row).
adult_train <- read.csv("adult-train.csv", header = TRUE)
adult_test <- read.csv("adult-test.csv", header = TRUE)


# Rows whose occupation, native_country, or workclass holds the "?"
# placeholder are treated as missing and dropped from BOTH splits.
filterTrainData <- which(adult_train$occupation == "?" |
                           adult_train$native_country == "?" |
                           adult_train$workclass == "?")
# BUG FIX: the original filter lacked the negation, so it KEPT only the
# "?" rows in the training split; the test split below negates the match,
# and the training split must do the same.
adult_train.df <- adult_train %>% filter(!row_number() %in% filterTrainData)

filterTestData <- which(adult_test$occupation == "?" |
                          adult_test$native_country == "?" |
                          adult_test$workclass == "?")
adult_test.df <- adult_test %>% filter(!row_number() %in% filterTestData)

# Show the cleaned splits.
adult_train.df
adult_test.df

Part 2-1-B

library(rpart)
library(rpart.plot)
# Fit a classification tree predicting income from every other column.
# NOTE(review): this trains on the raw `adult_train`, not the cleaned
# `adult_train.df` built in Part 2-1-A (n = 32560 below matches the raw
# data) — confirm which dataset was intended.
model <- rpart(income ~ ., data=adult_train, method="class")
print(model)
## n= 32560 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 32560 7841 <=50K (0.75918305 0.24081695)  
##    2) relationship=Not-in-family,Other-relative,Own-child,Unmarried 17799 1178 <=50K (0.93381651 0.06618349)  
##      4) capital_gain< 7073.5 17481  872 <=50K (0.95011727 0.04988273) *
##      5) capital_gain>=7073.5 318   12 >50K (0.03773585 0.96226415) *
##    3) relationship=Husband,Wife 14761 6663 <=50K (0.54860782 0.45139218)  
##      6) education=10th,11th,12th,1st-4th,5th-6th,7th-8th,9th,Assoc-acdm,Assoc-voc,HS-grad,Preschool,Some-college 10329 3456 <=50K (0.66540807 0.33459193)  
##       12) capital_gain< 5095.5 9807 2944 <=50K (0.69980626 0.30019374) *
##       13) capital_gain>=5095.5 522   10 >50K (0.01915709 0.98084291) *
##      7) education=Bachelors,Doctorate,Masters,Prof-school 4432 1225 >50K (0.27639892 0.72360108) *
# Visualize the fitted tree.
rpart.plot(model, extra=104, fallen.leaves=T, type=4, main="Income Train Dataset Decision Tree")

Part 2-1-B-i

# Report the three predictors with the highest importance scores.
top_predictors <- model$variable.importance[1:3]
print(top_predictors)
##   relationship marital_status   capital_gain 
##       2394.689       2356.078       1031.296
paste("The  top three important predictors in the model are: relationship, marital_status, capital_gain")
## [1] "The  top three important predictors in the model are: relationship, marital_status, capital_gain"

Part 2-1-B-ii

# The first split is done on relationship. The predicted class of the first node is "<=50K".
# The distribution of observations between the "<=50K" and ">50K" classes at the root node is 24719 and 7841 (the printed model shows n = 32560 with loss = 7841 at the root).

Part 2-1-C

# caret supplies confusionMatrix() plus the sensitivity()/specificity()
# helpers used in the following sections.
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice

# Class predictions on the test split, and their confusion matrix
# against the true labels.
predict.df <- predict(model, adult_test, type = "class")
confusionmat_table <- table(predict.df, adult_test$income)
confusionmat <- confusionMatrix(predict.df, as.factor(adult_test$income))
confusionmat
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 11805  1901
##      >50K    630  1945
##                                           
##                Accuracy : 0.8445          
##                  95% CI : (0.8389, 0.8501)
##     No Information Rate : 0.7638          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5137          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9493          
##             Specificity : 0.5057          
##          Pos Pred Value : 0.8613          
##          Neg Pred Value : 0.7553          
##              Prevalence : 0.7638          
##          Detection Rate : 0.7251          
##    Detection Prevalence : 0.8418          
##       Balanced Accuracy : 0.7275          
##                                           
##        'Positive' Class : <=50K           
## 

Part 2-1-C-i

# Balanced accuracy = mean of sensitivity and specificity, derived from
# the confusion-matrix table built in Part 2-1-C.
sensitivity.df <- sensitivity(confusionmat_table)
specificity.df <- specificity(confusionmat_table)
balanced.df <- (sensitivity.df + specificity.df) / 2
paste("The balanced accuracy is: ", round(balanced.df, digits = 3))
## [1] "The balanced accuracy is:  0.728"

Part 2-1-C-ii

# Balanced error rate is the complement of balanced accuracy.
balanced_errrate.df <- 1 - balanced.df
paste("The balanced error rate:", round(balanced_errrate.df, digits = 3))
## [1] "The balanced error rate: 0.272"

Part 2-1-C-iii

# Balanced accuracy is a metric we can use to assess the performance of a classification model. 
# It is calculated as Balanced accuracy = (Sensitivity + Specificity) / 2
# Sensitivity: The “true positive rate” – the percentage of positive cases the model is able to detect
# Specificity: The “true negative rate” – the percentage of negative cases the model is able to detect
# This metric is particularly useful when the two classes are imbalanced – that is, one class appears much more than the other

Part 2-1-C-iv

library(ROCR)
# AUC - AUC stands for "Area under the ROC Curve." That is, AUC measures the entire two-dimensional area underneath the entire ROC curve from (0,0) to (1,1).
# ROC CURVE - ROC (Receiver Operator Characteristic Curve) can help in deciding the best threshold value. A ROC curve is plotted with FPR on the X-axis and TPR on the y-axis.

# Score class probabilities on the test split (second column) and build
# the ROC curve (TPR vs FPR) together with the area under it.
pred.rocr <- predict(model, newdata = adult_test, type = "prob")[, 2]
roc.pred <- prediction(pred.rocr, adult_test$income)
roc.perf <- performance(roc.pred, "tpr", "fpr")
auc <- performance(roc.pred, measure = "auc")
paste("AUC: ", round(auc@y.values[[1]], digits = 3))
## [1] "AUC:  0.845"
plot(roc.perf, colorize = TRUE, lwd = 3)
abline(0, 1)

### Part 2-1-D

# Part 2-1-D: compare the unpruned tree against a tree pruned at the
# complexity parameter (CP) that minimizes cross-validated error (xerror).
base_acc <- mean(predict.df == adult_test$income)
printcp(model)
## 
## Classification tree:
## rpart(formula = income ~ ., data = adult_train, method = "class")
## 
## Variables actually used in tree construction:
## [1] capital_gain education    relationship
## 
## Root node error: 7841/32560 = 0.24082
## 
## n= 32560 
## 
##         CP nsplit rel error  xerror      xstd
## 1 0.126387      0   1.00000 1.00000 0.0098398
## 2 0.064022      2   0.74723 0.74723 0.0088402
## 3 0.037495      3   0.68320 0.68320 0.0085321
## 4 0.010000      4   0.64571 0.64571 0.0083394
plotcp(model)

# BUG FIX: the original indexed cptable with a single subscript
# (`cptable[which.min(...)]`), which reads the matrix column-major and only
# returned the CP value because CP happens to be column 1. Select the row
# and the "CP" column explicitly, and reuse the value in prune() instead of
# hard-coding 0.01.
best_cp <- model$cptable[which.min(model$cptable[, "xerror"]), "CP"]
paste('The optimal CP value is', best_cp)
## [1] "The optimal CP value is 0.01"
model_prune <- prune(model, cp = best_cp)
predict_prune <- predict(model_prune, adult_test, type = "class")
prune_acc <- mean(predict_prune == adult_test$income)
prune_data <- data.frame(base_acc, prune_acc)
prune_data
# The accuracy of the model on the test data is equal when the tree is pruned, so there won't be any benefit of pruning on this model. 

Part 2-1-E-i

set.seed(1122)

# Class balance of the raw training labels. The tree's root node shows the
# same split: ~75.92% "<=50K" vs ~24.08% ">50K" (per the printed model's
# root probabilities 0.75918 / 0.24082).
train_table <- table(adult_train$income)
paste("The total number of observations are in the class <=50K: ", train_table[1])
## [1] "The total number of observations are in the class <=50K:  24719"
paste("The total number of observations are in the class >50K: ", train_table[2])
## [1] "The total number of observations are in the class >50K:  7841"

Part 2-1-E-ii

set.seed(1122)

# Undersample the majority class ("<=50K") down to the size of the minority
# class (">50K") so the rebalanced training set has equal class counts.
# (Local renamed from `sample`, which shadowed base::sample.)
low_income_rows <- which(adult_train$income == "<=50K")
high_income_rows <- which(adult_train$income == ">50K")
keep_rows <- sample(low_income_rows, size = length(high_income_rows))
new_adtrain <- adult_train[c(keep_rows, high_income_rows), ]
new_adtrain
table(new_adtrain$income)
## 
## <=50K  >50K 
##  7841  7841

Part 2-1-E-iii

# rpart/rpart.plot were already attached in Part 2-1-B; reloading is a no-op.
library(rpart)
library(rpart.plot)

# Refit the tree on the balanced (undersampled) training data and evaluate
# it on the original test split.
new_model <- rpart(income ~ ., data = new_adtrain, method = "class")
new_predictor <- predict(new_model, adult_test, type = "class")
new_confusion_mattable <- table(new_predictor, adult_test$income)
new_confusion_mat <- confusionMatrix(new_predictor, as.factor(adult_test$income))
new_confusion_mat
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K >50K
##      <=50K  9727  660
##      >50K   2708 3186
##                                           
##                Accuracy : 0.7931          
##                  95% CI : (0.7868, 0.7993)
##     No Information Rate : 0.7638          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5158          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.7822          
##             Specificity : 0.8284          
##          Pos Pred Value : 0.9365          
##          Neg Pred Value : 0.5405          
##              Prevalence : 0.7638          
##          Detection Rate : 0.5974          
##    Detection Prevalence : 0.6380          
##       Balanced Accuracy : 0.8053          
##                                           
##        'Positive' Class : <=50K           
## 

Part 2-1-E-iii-i

# Balanced accuracy of the model trained on the rebalanced data.
new_sensitivity <- sensitivity(new_confusion_mattable)
new_specificity <- specificity(new_confusion_mattable)
new_balanced <- (new_sensitivity + new_specificity) / 2
paste("The balanced accuracy is: ", round(new_balanced, digits = 3))
## [1] "The balanced accuracy is:  0.805"

Part 2-1-E-iii-ii

# Balanced error rate = 1 - balanced accuracy.
new_balanced_error_rate <- 1 - new_balanced
paste("The balanced error rate:", round(new_balanced_error_rate, digits = 3))
## [1] "The balanced error rate: 0.195"

Part 2-1-E-iii-iii

# Balanced accuracy is a metric we can use to assess the performance of a classification model
# It is calculated as: Balanced accuracy = (Sensitivity + Specificity) / 2
# Sensitivity: The “true positive rate” – the percentage of positive cases the model is able to detect
# Specificity: The “true negative rate” – the percentage of negative cases the model is able to detect
# This metric is particularly useful when the two classes are imbalanced – that is, one class appears much more than the other

Part 2-1-E-iii-iv

# AUC - AUC stands for "Area under the ROC Curve." That is, AUC measures the entire two-dimensional area underneath the entire ROC curve from (0,0) to (1,1).
# ROC CURVE - ROC (Receiver Operator Characteristic Curve) can help in deciding the best threshold value. A ROC curve is plotted with FPR on the X-axis and TPR on the y-axis.

# NOTE(review): this ROC is computed on the CLEANED test split
# (`adult_test.df`), while every other Part 2-1-E metric above used the raw
# `adult_test` — confirm which split was intended, as the AUC differs
# between them.
new_pred.rocr <- predict(new_model, newdata=adult_test.df, type="prob")[,2]
new_roc.pred <- prediction(new_pred.rocr, adult_test.df$income)
new_roc.perf <- performance(new_roc.pred, "tpr", "fpr")
new_auc <- performance(new_roc.pred, measure = "auc")
paste("AUC: ", round((new_auc@y.values[[1]]), digits = 3))
## [1] "AUC:  0.845"
plot(new_roc.perf, colorize=T, lwd=3)
abline(0,1)

Part 2-1-F

# The balanced accuracy, sensitivity, specificity, positive predictive value and AUC of the model used in 2.1 (c)
# Sensitivity                  : 0.949        
# Specificity                  : 0.505         
# Positive Predictive Value    : 0.861
# Balanced Accuracy            : 0.727
# AUC                          : 0.845
# Balanced Error Rate          : 0.272
# The balanced accuracy, sensitivity, specificity, positive predictive value and AUC of the model used in 2.1 (e)
# Sensitivity                  : 0.782         
# Specificity                  : 0.828         
# Positive Predictive Value    : 0.936
# Balanced Accuracy            : 0.805
# AUC                          : 0.815
# Balanced Error Rate          : 0.195

# sensitivity and specificity will always be inversely related (i.e., one increases as the other decreases). With the balanced data using under sampling on the model (e) the specificity has been increased whereas sensitivity has been decreased, but the overall balanced accuracy has been increased, ppv value increased.